library(feather)
library(xtable)
library(fuzzyjoin)
library(dplyr)
library(data.table)
library(RSQLite)
library(zoo)
library(xlsx)
library(stringdist)
library(stringr)
library(tm)
library(tabulizer)
library(lattice)
library(latticeExtra)
library(extrafont)
library(RColorBrewer)
library(classInt)
library(grid)
library(rgdal)
library(raster)
library(spatstat)
require(readstata13)
library(stringr)
require("RecordLinkage")
library(stringdist)
library(fuzzyjoin)
library(dplyr)
require(linked.table)
library(pastecs)

# Start from a clean workspace. This has to happen BEFORE any helper is
# defined: when the wipe ran after the definition of `trim`, it deleted the
# helper again, so later `trim()` calls resolved to whatever same-named
# function an attached package happens to export (or failed outright).
rm(list = ls())

# Strip leading and trailing whitespace from every element of `x`.
#
# x: character vector (other types are coerced to character by gsub()).
# Returns a character vector of the same length; NA elements stay NA.
trim <- function(x) gsub("^\\s+|\\s+$", "", x)

##############################################################
### Match NAPP to linked ICEM based on HHNUMhead social    ###
### Identify father's occupation and brother

## Datasets used: 
## (1) Dataset_1851_1881
## (2) dictionary_final3.RData
## (3) MolaModified_NAPP_JAG4new.dta

## Datasets created: 
## (1) father.RData
## (2) sibling.RData

## Output created: 
## Descriptives : Table C1 in the Appendix
## Transition matrix: Table F6

##############################################################
# Folder A_CREATING_SAMPLE (input data). No trailing slash: this path is
# only ever combined with a file name through file.path().
fldr_a <- "/Users/myra/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/A_CREATING_SAMPLE"

# Folder C_ANALYSIS (dictionary input, .RData outputs). Keeps its trailing
# slash because file names are appended by plain string concatenation.
fldr_c <- "/Users/myra/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/C_ANALYSIS/"

# NAPP folder (1881 NAPP extract). Trailing slash required for the same
# reason as fldr_c.
fldr_napp <- "/Users/myra/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/NAPP/"

####################################################################
### Table F6. Transition tables with linked census 
### with age restriction and using father's occupation from 1851 ###
####################################################################

# Load the linked 1851-1881 sample. (The closing parenthesis of this call
# was missing, which made the whole script fail to parse.)
dat <- read_feather(file.path(fldr_a, "Dataset_1851_1881"))
setDT(dat)

# Keep individuals aged 43-49 with a known own occupation category and a
# known 1851 father category.
dat <- dat[!is.na(cat) & !is.na(father_cat_1851) & Age %in% 43:49]

# Fix the category order so rows (son) and columns (father) of the
# transition table line up.
occ_levels <- c("professional", "commercial", "artisan", "builder", "food", "service")
dat[, Flfocc := factor(cat, occ_levels)]
dat[, father_Flfocc := factor(father_cat_1851, occ_levels)]

# tab2: cell counts (son x father); tab1: shares within each father category
# (columns sum to one).
tab2 <- table(dat[, .(Flfocc, father_Flfocc)])
tab1 <- prop.table(tab2, margin = 2)

# Build the character matrix behind Table F6 from `tab1` (column shares)
# and `tab2` (counts). Every son category occupies two rows: the shares
# (odd rows) and, directly below, the counts wrapped in braces (even rows).
temp1 <- format(round(tab1, 3), nsmall = 3)

# Wrap every count in escaped braces in one vectorised step.
temp2 <- tab2
temp2[] <- paste0("\\{", tab2, "\\}")

# Signal the largest probability within each father category (column).
# The loop runs over COLUMNS, so it is bounded by ncol(); the former bound
# nrow() was only correct because the table is square.
for (i in seq_len(ncol(tab1))) {
  j <- which.max(tab1[, i])
  if (length(j) == 0) next # column without observations: nothing to bold
  temp1[j, i] <- paste0("\\textbf{", temp1[j, i], "}")
  temp2[j, i] <- paste0("\\textbf{", temp2[j, i], "}")
}

# Interleave share rows and count rows.
n_body <- nrow(temp1) + nrow(temp2)
odd <- seq(1, n_body - 1, 2)
even <- odd + 1

temp <- matrix(NA_character_, nrow = n_body, ncol = ncol(temp1))
temp[odd, ] <- temp1
temp[even, ] <- temp2

# Header cells are the father categories (columns); the left-hand label
# column carries the son categories (rows), printed on share rows only.
cells <- colnames(tab1)
cells_spc <- rep("", nrow(temp))
cells_spc[odd] <- rownames(tab1)

tab <- cbind(cells_spc, temp)
tab <- rbind(c("Son", cells), tab)

# Row totals sit on the header row and on every count row.
# apply(..., sum) keeps integer counts, so large totals are not printed in
# scientific notation when coerced to character.
rowTotal <- matrix("", ncol = 1, nrow = nrow(tab))
rowTotal[seq(1, nrow(tab), 2)] <- c("Total", apply(tab2, MARGIN = 1, sum))
tab <- cbind(tab, rowTotal)

# Column totals plus the grand total form the last row.
colTotal <- c("\\midrule Total", apply(tab2, MARGIN = 2, sum), sum(tab2))
tab <- rbind(tab, colTotal)

# Render `tab` as LaTeX and write it to disk.
x <- suppressWarnings(xtable(tab))

# One alignment entry per column plus one for the (suppressed) row names:
# two left-aligned, the body centred, the totals column right-aligned.
align(x) <- c(rep("l", 2), rep("c", ncol(tab) - 2), "r")

# Insert a rule after the first (header) row.
addtorow <- list(pos = list(1), command = c("\\midrule "))

# print() echoes the table and returns the LaTeX source as a string. All
# sanitising is switched off because the cells already contain LaTeX markup.
out <- print(
  x,
  sanitize.colnames.function = identity,
  add.to.row = addtorow,
  sanitize.text.function = identity,
  include.rownames = FALSE,
  include.colnames = FALSE,
  floating.environment = "threeparttable"
)

# Put the table note right before the end of the threeparttable environment.
out <- gsub(
  "(\\\\end\\{threeparttable\\})",
  "\\\\footnotesize Individuals that were 13-19 when father occupation was measured. Father occupation measured at previous Census \\1",
  out
)

# Add the "Father" spanner between the two opening \hline commands.
# NOTE(review): the spanner is hard-coded to 4 columns (2-5) although the
# table is built from six categories - confirm this is intended.
out <- gsub(
  "(\\\\hline\\n)  (\\\\hline)",
  "\\1  & \\\\multicolumn\\{4\\}\\{c\\}\\{Father\\} \\\\\\\\ \\\\cmidrule(){2-5}",
  out
)

# Written relative to the current working directory.
cat(out, file = "transition_table1851_son_13-19.tex")

######################################################################
### 1. Match between linked 1851-1881 ICeM data and 1881 NAPP data 
### to get fathers' occupation
######################################################################

### 1. Clean linked 1851-1881 data ###

# Reload the linked 1851-1881 sample, ordered by person id. (The closing
# parenthesis of read_feather() was missing, which is a parse error.)
dat <- read_feather(file.path(fldr_a, "Dataset_1851_1881"))
dat <- data.frame(dat[order(dat$person_idH), ])
dat <- dat[dat$Year == 1881, ] # Keep only people from 1881

# Determine occupation of father by using 1851 census OR 1881 census:
# take prev_father_Occ and fall back to father_Occ when it is NA or blank.
dat$Occ_father_all <- dat$prev_father_Occ
no_prev_occ <- is.na(dat$Occ_father_all) | dat$Occ_father_all == "" | dat$Occ_father_all == " "
dat$Occ_father_all <- ifelse(no_prev_occ, dat$father_Occ, dat$Occ_father_all)

# Add occupation category to sons and fathers.
# load() brings `dictionary` (occupation string -> category) into scope.
load(file = paste0(fldr_c, "dictionary_final3.RData"))
dictionary <- unique(dictionary[, c("cat", "Occ")])

# Son's own category. Left merges keep every person; occupations without a
# dictionary entry get NA.
dat <- merge(dat, dictionary, by = "Occ", all.x = TRUE)

# The same dictionary is reused three times; its category column is renamed
# before each merge so every father occupation variable gets its own column.
names(dictionary) <- c("father_cat", "Occ")
dat <- merge(dat, dictionary, by.x = "Occ_father_all", by.y = "Occ", all.x = TRUE)
names(dictionary) <- c("father_cat_1881", "Occ")
dat <- merge(dat, dictionary, by.x = "father_Occ", by.y = "Occ", all.x = TRUE)
names(dictionary) <- c("father_cat_1851", "Occ")
dat <- merge(dat, dictionary, by.x = "prev_father_Occ", by.y = "Occ", all.x = TRUE)

# Keep only the variables used by the cleaning and linking steps below.
dat <- dat[, c("Sname", "father_Sname", "H_Sname", "Pname", "Oname", "Age", "Occ", "Parish", "ParID", "Std_Par", "BpCnty", "Address", "stid", "hh_id", "H", "father_cat", "father_cat_1881", "father_cat_1851", "father_Age", "prev_father_Age", "prev_father_Occ", "father_Occ", "Occ_father_all", "person_idH", "cat", "BpCmty", "Mar")]

# Clean names: trim whitespace, then split the given-name string on single
# spaces into first (token 1) and middle (token 2) name.
dat$Pname <- trim(dat$Pname)
dat$Sname <- trim(dat$Sname)
given_names <- strsplit(as.character(dat$Pname), " ")
dat$Fname <- trim(sapply(given_names, "[", 1))
dat$Mname <- trim(sapply(given_names, "[", 2))
rm(given_names)

# Surname and first name: drop everything except ASCII letters and spaces,
# then collapse runs of spaces/punctuation into a single space.
dat$Sname <- gsub("[[:punct:] ]+", " ", gsub("[^A-Za-z ]", "", dat$Sname))
dat$Fname <- gsub("[[:punct:] ]+", " ", gsub("[^A-Za-z ]", "", dat$Fname))

# Clean address: lower-case, letters and spaces only, trimmed.
dat$Address <- trim(gsub("[^A-Za-z ]", "", tolower(dat$Address)))

# Make sure both merge keys are plain character vectors.
dat$Parish <- as.character(dat$Parish)
dat$Address <- as.character(dat$Address)

# remove missing occupation
# `cat` comes from a left merge with the dictionary, so it is NA for every
# occupation without a dictionary entry. A bare `dat$cat != ""` evaluates to
# NA for those rows, and NA in a data.frame row index yields all-NA phantom
# rows instead of dropping them; the explicit is.na() guard removes them.
dat <- dat[!is.na(dat$cat) & dat$cat != "", ]

# married: 1 when Mar is 2 or 3, 0 otherwise (NA stays NA).
# NOTE(review): the meaning of codes 2 and 3 is not visible here - confirm
# against the census codebook.
dat$married <- ifelse(dat$Mar == 2 | dat$Mar == 3, 1, 0)

# Clean parish of birth (BpCmty) and parish of residence (Parish).
#
# Every line is a regex substitution applied to the whole column, and the
# rules are ORDER-DEPENDENT: several later rules exist only to repair
# artefacts created by earlier ones. Examples:
#   "PANCRAS" -> "ST PANCRAS" turns an existing "ST PANCRAS" into
#     "ST ST PANCRAS", which the "ST ST PANCRAS" rule then undoes;
#   "BETHNAL G" -> "BETHNAL GREEN" turns "BETHNAL GREEN" into
#     "BETHNAL GREENREEN", which the "BETHNAL GREENREEN" rule then undoes.
# Do not reorder or deduplicate these lines without re-checking the output.
# Rules on BpCmty and rules on Parish touch different columns and are
# independent of each other.
#
# NOTE(review): patterns are regular expressions, so the "." in
# "ST. ST PANCRAS", "ST.LUKES" and "ST.ST PANCRAS" matches any character.
# NOTE(review): "MILE END" -> "MILE END OLD TOWN" is applied twice and also
# hits values that already read "MILE END OLD TOWN"; no later rule appears to
# collapse the repeated " OLD TOWN", and the later "MILE END N T" rule can
# no longer match once " OLD TOWN" has been inserted - verify.
# NOTE(review): "SOUTHWARK" -> "ST SAVIOUR SOUTHWARK" on BpCmty also hits
# values already expanded by the "SOUTHK"/"STHWARK"/"SOUTHW ST OLAVE" rules,
# while Parish is collapsed to plain "SOUTHWARK" - confirm this asymmetry is
# intended, since BpCmty and Parish are compared for equality afterwards.
dat$BpCmty <- gsub("PANCRAS", "ST PANCRAS", dat$BpCmty)
dat$BpCmty <- gsub("BETHNAL G", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("MARYLEBONE", "ST MARYLEBONE", dat$BpCmty)
dat$BpCmty <- gsub("ROTHERHE", "ROTHERHITHE", dat$BpCmty)
dat$BpCmty <- gsub("LONDON BETHNAL GREEN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("B GREEN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("BETH GREEN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("SOUTHW ST OLAVE", "ST OLAVE SOUTHWARK", dat$BpCmty)
# Repairs for the doubled prefixes/suffixes introduced by the rules above.
dat$BpCmty <- gsub("ST ST PANCRAS", "ST PANCRAS", dat$BpCmty)
dat$BpCmty <- gsub("BETHNAL GREENREEN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("BETHNAL GREENN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("ST LEONARDS SHOREDIT", "SHOREDITCH", dat$BpCmty)
dat$Parish <- gsub("ST JAMES WESTMINSTER","WESTMINSTER",dat$Parish)
dat$Parish <- gsub("ST GEORGE THE MARTYR SOUTHWARK", "SOUTHWARK", dat$Parish)
dat$BpCmty <- gsub("ST LUKES", "ST LUKE", dat$BpCmty)
dat$BpCmty <- gsub("ST GEORGES EAST", "ST GEORGE IN THE EAST", dat$BpCmty)
dat$BpCmty <- gsub("ST ST MARYLEBONE", "ST MARYLEBONE", dat$BpCmty)
dat$Parish <- gsub("ST ANDREW HOLBORN ABOVE THE BARS AND ST GEORGE", "HOLBORN", dat$Parish)
dat$Parish <- gsub("LONDON ST MARYLEBONE, LONDON", "ST MARYLEBONE", dat$Parish)
dat$Parish <- gsub("LONDON ST ST PANCRAS, L", "ST PANCRAS", dat$Parish)
dat$BpCmty <- gsub("BETHNAL GREENR","BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("ST GEORGES EAST", "ST GEORGE IN THE EAST", dat$BpCmty)
dat$BpCmty <- gsub("SOUTHK", "ST SAVIOUR SOUTHWARK", dat$BpCmty)
dat$BpCmty <- gsub("STHWARK", "ST SAVIOUR SOUTHWARK", dat$BpCmty)
dat$BpCmty <- gsub("WESTMINSTER ST JAMES", "ST JAMES WESTMINSTER", dat$BpCmty)
dat$BpCmty <- gsub("BMDSY", "BERMONDSEY", dat$BpCmty)
dat$BpCmty <- gsub("BERSY", "BERMONDSEY", dat$BpCmty)
dat$BpCmty <- gsub("ST LEONARDS SHOREDIT", "SHOREDITCH", dat$BpCmty)
dat$BpCmty <- gsub("MILE END", "MILE END OLD TOWN", dat$BpCmty)
dat$Parish <- gsub("ST GILES IN THE FIELDS AND ST GEORGE BLOOMSBURY", "ST GILES FIELDS", dat$Parish)
dat$Parish <- gsub("CHRISTCHURCH SOUTHWARK", "SOUTHWARK", dat$Parish)
dat$BpCmty <- gsub("B'SEA", "BATTERSEA", dat$BpCmty)
dat$BpCmty <- gsub("MILE END", "MILE END OLD TOWN",dat$BpCmty)
dat$BpCmty <- gsub("LONDON ISLINGTON, LO", "ISLINGTON",dat$BpCmty)
dat$BpCmty <- gsub("SHOREH", "SHOREDITCH", dat$BpCmty)
dat$BpCmty <- gsub("ST GEO HAN SQ", "ST GEORGE HANOVER SQUARE", dat$BpCmty)
dat$BpCmty <- gsub("SOUTHWARK", "ST SAVIOUR SOUTHWARK", dat$BpCmty)
dat$BpCmty <- gsub("BERMY", "BERMONDSEY", dat$BpCmty)
dat$BpCmty <- gsub("ST GEORGE HAN SQ", "ST GEORGE HANOVER SQUARE", dat$BpCmty)
dat$Parish <- gsub("ST OLAVE SOUTHWARK", "SOUTHWARK", dat$Parish)
dat$BpCmty <- gsub("CLERKLL", "CLERKENWELL", dat$BpCmty)
dat$Parish <- gsub("ST SAVIOUR SOUTHWARK", "SOUTHWARK", dat$Parish)
dat$Parish <- gsub("ST MARGARET AND ST JOHN THE EVANGELIST WESTMINSTER", "WESTMINSTER", dat$Parish)
dat$BpCmty <- gsub("WESTMR", "WESTMINSTER", dat$BpCmty)
dat$BpCmty <- gsub("BETH GN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("CHRIST CHURCH", "SOUTHWARK", dat$BpCmty)
dat$BpCmty <- gsub("ST. ST PANCRAS", "ST PANCRAS", dat$BpCmty)
dat$BpCmty <- gsub("WESTMINSTER ST JAMES", "ST JAMES WESTMINSTER", dat$BpCmty)
dat$BpCmty <- gsub("BETHNAL GREENR","BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("LAMBH", "LAMBETH", dat$BpCmty)
dat$BpCmty <- gsub("M E O TOWN", "MILE END OLD TOWN", dat$BpCmty)
dat$BpCmty <- gsub("M E O T", "MILE END OLD TOWN", dat$BpCmty)
dat$Parish <- gsub("ST MARGARET AND ST JOHN THE EVANGELIST WESTMINSTER", "WESTMINSTER", dat$Parish)
dat$BpCmty <- gsub("WESTR","WESTMINSTER", dat$BpCmty)
dat$BpCmty <- gsub("LONDON CLERKL, LONDO", "CLERKENWELL", dat$BpCmty)
dat$BpCmty <- gsub("CLERKLL", "CLERKENWELL", dat$BpCmty)
dat$BpCmty <- gsub("LAMBETT", "LAMBETH", dat$BpCmty)
dat$BpCmty <- gsub("BATTORSEA", "BATTERSEA", dat$BpCmty)
dat$BpCmty <- gsub("BETH GRN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("LONDON LAMBETH, LOND", "LAMBETH", dat$BpCmty)
dat$BpCmty <- gsub("MILE END N T", "MILE END NEW TOWN", dat$BpCmty)
dat$BpCmty <- gsub("ST GEORGE E", "ST GEORGE IN THE EAST", dat$BpCmty)
dat$BpCmty <- gsub("CLERKEN WELL", "CLERKENWELL", dat$BpCmty)
dat$BpCmty <- gsub("WCHAPEL", "WHITECHAPEL", dat$BpCmty)
dat$BpCmty <- gsub("ST GEORGES EAST", "ST GEORGE IN THE EAST", dat$BpCmty)
dat$BpCmty <- gsub("STHWARK", "SOUTHWARK", dat$BpCmty)
dat$BpCmty <- gsub("SHORDITCH","SHOREDITCH", dat$BpCmty)
dat$BpCmty <- gsub("OLD ST ST LUKE", "ST LUKE", dat$BpCmty)
dat$BpCmty <- gsub("ST MARGT WESTMR", "WESTMINSTER", dat$BpCmty)
dat$BpCmty <- gsub("SGHOREDITCH","SHOREDITCH", dat$BpCmty)
dat$Parish <- gsub("ST SAVIOUR SOUTHWARK", "SOUTHWARK", dat$Parish)
dat$Parish <- gsub("ST ANDREW HOLBORN ABOVE THE BARS AND ST GEORGE THE MARTYR", "HOLBORN", dat$Parish)
dat$BpCmty <- gsub("CAMBLL", "CAMBERWELL", dat$BpCmty)
dat$BpCmty <- gsub("LONDON ISLINGTON, LO", "ISLINGTON", dat$BpCmty)
dat$BpCmty <- gsub("CLERKLL", "CLERKENWELL", dat$BpCmty)
dat$BpCmty <- gsub("ST.LUKES", "ST LUKE", dat$BpCmty)
dat$Parish <- gsub("ST GEORGE THE MARTYR SOUTHWARK", "SOUTHWARK", dat$Parish)
dat$BpCmty <- gsub("LONDON LAMBETH, LOND", "LAMBETH", dat$BpCmty)
dat$BpCmty <- gsub("LONDON ST ST PANCRAS, L", "ST PANCRAS", dat$BpCmty)
dat$BpCmty <- gsub("ROTHRHITHE", "ROTHERHITHE", dat$BpCmty)
dat$BpCmty <- gsub("ST MARTINS", "ST MARTIN IN THE FIELDS", dat$BpCmty)
dat$BpCmty <- gsub("B G", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("BETH GRN", "BETHNAL GREEN", dat$BpCmty)
dat$BpCmty <- gsub("ST.ST PANCRAS","ST PANCRAS", dat$BpCmty)
dat$Parish <- gsub("BOW AKA ST MARY STRATFORD LE BOW", "BOW", dat$Parish)
dat$BpCmty <- gsub("BERMONSEY", "BERMONDSEY", dat$BpCmty)
dat$Parish <- gsub("ST BOTOLPH WITHOUT ALDGATE OR EAST SMITHFIELD", "ALDGATE", dat$Parish)
dat$BpCmty <- gsub("B GN", "BETHNAL GREEN", dat$BpCmty)
# Final whitespace trim so the two columns can be compared for equality.
dat$BpCmty <- trim(dat$BpCmty)
dat$Parish <- trim(dat$Parish)

# dummy stayers vs movers: a stayer lives in the parish he was born in
# (cleaned birth parish equals cleaned parish of residence); a mover is the
# complement. NA in either parish gives NA in both dummies.
dat$stayerp <- ifelse(dat$BpCmty == dat$Parish, 1, 0)
dat$moverp <- 1 - dat$stayerp

## Table C.1: Sample sizes descriptives ###
# Every figure is the number of distinct person_idH values among the rows
# selected by a logical mask; the trailing comments are the values recorded
# by the authors. A mask entry that is NA selects an NA id, which unique()
# counts as one extra value.
count_persons <- function(keep) length(unique(dat$person_idH[keep]))

working_age <- dat$Age %in% 15:60
has_father_1851 <- !is.na(dat$father_cat_1851) & dat$father_cat_1851 != ""

count_persons(TRUE) #  403341
count_persons(working_age) # 257172
# linked to 1851 with cat of father and son
count_persons(has_father_1851 & working_age) # 30472
# Share of the same occupation of father and son
count_persons(has_father_1851 & dat$Occ == dat$prev_father_Occ & working_age) # 1504
count_persons(has_father_1851 & dat$Occ != dat$prev_father_Occ & working_age) #28968 
# Share of the same occupation cat of father and son
count_persons(has_father_1851 & dat$cat == dat$father_cat_1851 & working_age) # 9409
count_persons(has_father_1851 & dat$cat != dat$father_cat_1851 & working_age) #21063

### 2. Clean 1881 NAPP ###

# Baseline sample: male household heads aged 15-60 with known marital
# status and nativity. The filters are applied one after another on purpose.
mola <- read.dta13(paste0(fldr_napp, "MolaModified_NAPP_JAG4new.dta"))
mola <- mola[mola$SEX == "Male", ]

# NOTE(review): if AGE is imported as a labelled factor, as.numeric() returns
# level codes rather than ages - confirm the column type.
mola$AGE <- as.numeric(mola$AGE)
mola <- mola[mola$AGE >= 15 & mola$AGE <= 60, ]
mola <- mola[mola$MARST != "Unknown and NIU", ]
mola <- mola[mola$nativity != "Unknown", ]
mola <- mola[mola$hhhead == 1, ]

# Variables needed for linking.
mola <- mola[, c("HHNUMhead", "hhhead", "RECIDGB", "social", "NAMELAST", "NAMEFRST", "AGE", "OCC81GB", "PARIDGB", "parish", "ADDRESS", "name", "name2", "new_address", "objectid_1")]

# Clean names
mola$NAMEFRST <- trim(toupper(mola$NAMEFRST))
mola$NAMELAST <- trim(mola$NAMELAST)

# Split the full given-name string ONCE and take both tokens from that
# split. Previously MNAME was read from NAMEFRST after it had already been
# cut down to its first token, so the middle name was always NA.
# vapply() keeps the result a character vector even for empty input.
given_parts <- strsplit(as.character(mola$NAMEFRST), " ")
mola$NAMEFRST <- trim(vapply(given_parts, function(p) p[1], character(1)))
mola$MNAME <- trim(vapply(given_parts, function(p) p[2], character(1)))
rm(given_parts)

# Letters and spaces only; collapse runs of spaces/punctuation.
mola$NAMELAST <- gsub("[^A-Za-z ]", "", mola$NAMELAST)
mola$NAMELAST <- gsub("[[:punct:] ]+", " ", mola$NAMELAST)
mola$NAMEFRST <- gsub("[^A-Za-z ]", "", mola$NAMEFRST)
mola$NAMEFRST <- gsub("[[:punct:] ]+", " ", mola$NAMEFRST)

# Households without an identifier cannot be linked; this also removes the
# all-NA rows created by NA comparisons in the sample filters.
mola <- mola[!is.na(mola$HHNUMhead), ]

# Clean address
# NOTE(review): unlike Address in the linked sample, new_address is not
# lower-cased here - confirm it is already lower case, otherwise the
# address-based merge cannot match.
mola$new_address <- gsub("[^A-Za-z ]", "", mola$new_address)
mola$new_address <- trim(mola$new_address)

### 3. Link I-CEM and NAPP dataset ###
### step 1: Perfect match on first and last names and map point ###

# Inner join on surname, first name and map point (street id).
match <- merge(
  mola, dat,
  by.x = c("NAMELAST", "NAMEFRST", "objectid_1"),
  by.y = c("Sname", "Fname", "stid"),
  all = FALSE
)

# max. 3 years difference in abs value
match$diff_age <- abs(as.numeric(match$Age) - as.numeric(match$AGE))
match <- match[match$diff_age <= 3, ]

# remove duplicates: drop rows without a household id, then keep only
# (HHNUMhead, social) pairs that have exactly one candidate left.
match <- match[!is.na(match$HHNUMhead), ]

match$x <- 1
match <- data.table(match)
match[, xx := sum(x), by = .(HHNUMhead, social)] # candidates per pair
match <- data.frame(match)

match <- match[match$xx == 1, ]

match <- unique(match[, c(
  "person_idH", "HHNUMhead", "social", "father_cat", "father_cat_1881",
  "father_cat_1851", "father_Age", "prev_father_Age", "prev_father_Occ",
  "father_Occ", "Occ_father_all"
)])

### step 2: Perfect match on street ID and fuzzy matching on first and last names ###

# Candidates: households not linked in step 1, paired with every linked
# person on the same street id.
fuzzy <- merge(
  mola[!(mola$HHNUMhead %in% match$HHNUMhead), ],
  dat,
  by.x = "objectid_1", by.y = "stid",
  all = FALSE
)

# Jaro-Winkler distances (p = 0.1) for surname and first name.
fuzzy$name_dist <- stringdist(fuzzy$Sname, fuzzy$NAMELAST, method = "jw", p = 0.1)
fuzzy$name_dist2 <- stringdist(fuzzy$Fname, fuzzy$NAMEFRST, method = "jw", p = 0.1)

# Keep candidates that attain BOTH the smallest surname distance and the
# smallest first-name distance within their (HHNUMhead, social) group. The
# two filters stay sequential so NA handling is unchanged.
fuzzy <- data.table(fuzzy)
fuzzy[, `:=`(best_dist = min(name_dist), best_dist2 = min(name_dist2)),
      by = .(HHNUMhead, social)]
fuzzy <- data.frame(fuzzy)
fuzzy <- fuzzy[fuzzy$best_dist == fuzzy$name_dist, ]
fuzzy <- fuzzy[fuzzy$best_dist2 == fuzzy$name_dist2, ]

# At most 3 years of age difference.
fuzzy$diff_age <- abs(as.numeric(fuzzy$Age) - as.numeric(fuzzy$AGE))
fuzzy <- fuzzy[fuzzy$diff_age <= 3, ]

fuzzy <- fuzzy[!is.na(fuzzy$HHNUMhead), ]

# Keep only unambiguous links (exactly one candidate per group).
fuzzy$x <- 1
fuzzy <- data.table(fuzzy)
fuzzy[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy <- data.frame(fuzzy)

fuzzy <- fuzzy[fuzzy$xx == 1, ]

fuzzy <- unique(fuzzy[, c(
  "person_idH", "HHNUMhead", "social", "father_cat", "father_cat_1881",
  "father_cat_1851", "father_Age", "prev_father_Age", "prev_father_Occ",
  "father_Occ", "Occ_father_all"
)])

### step 3: Perfect match on parish and surname and fuzzy on first name ###

# Candidates: households not linked in steps 1-2, joined on parish and
# surname.
# NOTE(review): the NAPP key is PARIDGB while the linked-sample key is the
# cleaned parish NAME - confirm both columns use the same coding.
fuzzy2 <- merge(
  mola[!(mola$HHNUMhead %in% c(fuzzy$HHNUMhead, match$HHNUMhead)), ],
  dat,
  by.x = c("PARIDGB", "NAMELAST"), by.y = c("Parish", "Sname"),
  all = FALSE
)
fuzzy2$name_dist2 <- stringdist(fuzzy2$Fname, fuzzy2$NAMEFRST, method = "jw", p = 0.1)

# Closest first name within each (HHNUMhead, social) group.
fuzzy2 <- data.table(fuzzy2)
fuzzy2[, best_dist2 := min(name_dist2), by = .(HHNUMhead, social)]
fuzzy2 <- data.frame(fuzzy2)
fuzzy2 <- fuzzy2[fuzzy2$best_dist2 == fuzzy2$name_dist2, ]

# At most 3 years of age difference, then the closest age within the group.
fuzzy2$diff_age <- abs(as.numeric(fuzzy2$Age) - as.numeric(fuzzy2$AGE))
fuzzy2 <- fuzzy2[fuzzy2$diff_age <= 3, ]
fuzzy2 <- data.table(fuzzy2)
fuzzy2[, min_age := min(diff_age), by = .(HHNUMhead, social)]
fuzzy2 <- data.frame(fuzzy2)
fuzzy2 <- fuzzy2[fuzzy2$min_age == fuzzy2$diff_age, ]

fuzzy2 <- fuzzy2[!is.na(fuzzy2$HHNUMhead), ]

# Keep only unambiguous links.
fuzzy2$x <- 1
fuzzy2 <- data.table(fuzzy2)
fuzzy2[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy2 <- data.frame(fuzzy2)

fuzzy2 <- fuzzy2[fuzzy2$xx == 1, ]

fuzzy2 <- unique(fuzzy2[, c(
  "person_idH", "HHNUMhead", "social", "father_cat", "father_cat_1881",
  "father_cat_1851", "father_Age", "prev_father_Age", "prev_father_Occ",
  "father_Occ", "Occ_father_all"
)])

### step 4: Perfect match on parish and age and fuzzy match on last and first name ###

# Candidates: households with an id that were not linked in steps 1-3.
# %in% and is.na() never return NA, so one combined mask selects the same
# rows as successive subsetting.
fuzzy3 <- mola[
  !(mola$HHNUMhead %in% c(fuzzy2$HHNUMhead, fuzzy$HHNUMhead, match$HHNUMhead)) &
    !is.na(mola$HHNUMhead),
]

# Exact join on parish and age (so no age-difference filter in this step).
fuzzy3 <- merge(
  fuzzy3, dat,
  by.x = c("PARIDGB", "AGE"), by.y = c("Parish", "Age"),
  all = FALSE
)

fuzzy3$name_dist <- stringdist(fuzzy3$Sname, fuzzy3$NAMELAST, method = "jw", p = 0.1)
fuzzy3$name_dist2 <- stringdist(fuzzy3$Fname, fuzzy3$NAMEFRST, method = "jw", p = 0.1)

# Candidate must be the closest on surname and on first name in its group.
fuzzy3 <- data.table(fuzzy3)
fuzzy3[, `:=`(best_dist = min(name_dist), best_dist2 = min(name_dist2)),
       by = .(HHNUMhead, social)]
fuzzy3 <- data.frame(fuzzy3)
fuzzy3 <- fuzzy3[fuzzy3$best_dist == fuzzy3$name_dist, ]
fuzzy3 <- fuzzy3[fuzzy3$best_dist2 == fuzzy3$name_dist2, ]

# Keep only unambiguous links.
fuzzy3$x <- 1
fuzzy3 <- data.table(fuzzy3)
fuzzy3[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy3 <- data.frame(fuzzy3)

fuzzy3 <- fuzzy3[fuzzy3$xx == 1, ]

fuzzy3 <- unique(fuzzy3[, c(
  "person_idH", "HHNUMhead", "social", "father_cat", "father_cat_1881",
  "father_cat_1851", "father_Age", "prev_father_Age", "prev_father_Occ",
  "father_Occ", "Occ_father_all"
)])

### step 5: Perfect match on parish and address and fuzzy match on last and first name ###

# Candidates: households with an id that were not linked in steps 1-4.
fuzzy4 <- mola[
  !(mola$HHNUMhead %in% c(fuzzy3$HHNUMhead, fuzzy2$HHNUMhead,
                          fuzzy$HHNUMhead, match$HHNUMhead)) &
    !is.na(mola$HHNUMhead),
]

# Exact join on parish and cleaned address.
fuzzy4 <- merge(
  fuzzy4, dat,
  by.x = c("PARIDGB", "new_address"), by.y = c("Parish", "Address"),
  all = FALSE
)

fuzzy4$name_dist <- stringdist(fuzzy4$Sname, fuzzy4$NAMELAST, method = "jw", p = 0.1)
fuzzy4$name_dist2 <- stringdist(fuzzy4$Fname, fuzzy4$NAMEFRST, method = "jw", p = 0.1)

# Candidate must be the closest on surname and on first name in its group.
fuzzy4 <- data.table(fuzzy4)
fuzzy4[, `:=`(best_dist = min(name_dist), best_dist2 = min(name_dist2)),
       by = .(HHNUMhead, social)]
fuzzy4 <- data.frame(fuzzy4)
fuzzy4 <- fuzzy4[fuzzy4$best_dist == fuzzy4$name_dist, ]
fuzzy4 <- fuzzy4[fuzzy4$best_dist2 == fuzzy4$name_dist2, ]

# At most 3 years of age difference, then the closest age within the group.
fuzzy4$diff_age <- abs(as.numeric(fuzzy4$Age) - as.numeric(fuzzy4$AGE))
fuzzy4 <- fuzzy4[fuzzy4$diff_age <= 3, ]
fuzzy4 <- data.table(fuzzy4)
fuzzy4[, min_age := min(diff_age), by = .(HHNUMhead, social)]
fuzzy4 <- data.frame(fuzzy4)
fuzzy4 <- fuzzy4[fuzzy4$min_age == fuzzy4$diff_age, ]

# Keep only unambiguous links.
fuzzy4$x <- 1
fuzzy4 <- data.table(fuzzy4)
fuzzy4[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy4 <- data.frame(fuzzy4)

fuzzy4 <- fuzzy4[fuzzy4$xx == 1, ]

fuzzy4 <- unique(fuzzy4[, c(
  "person_idH", "HHNUMhead", "social", "father_cat", "father_cat_1881",
  "father_cat_1851", "father_Age", "prev_father_Age", "prev_father_Occ",
  "father_Occ", "Occ_father_all"
)])

### 4. Place all together ###

# Stack the links from all five steps. The result was previously bound to
# `matched2` while the following lines read `matched`, which does not exist
# at this point (the workspace is wiped at the top of the script), so the
# script stopped here with "object 'matched' not found".
matched <- rbind(match, fuzzy, fuzzy2, fuzzy3, fuzzy4)
rm(match, fuzzy, fuzzy2, fuzzy3, fuzzy4)

# Keep links with a usable father category (not NA, not blank) and only the
# household identifiers plus that category.
has_father_cat <- !is.na(matched$father_cat) & matched$father_cat != "" & matched$father_cat != " "
matched <- matched[has_father_cat, c("HHNUMhead", "social", "father_cat")]
rm(has_father_cat)

save(matched, file = paste0(fldr_c, "father.RData"))

###############################################################
### 2. Match linked ICEM to NAPP based on HHNUMhead social  ###
### for sample of siblings in 1851
###############################################################

### 1. Clean linked 1851-1881 data ###

# Reload the linked 1851-1881 sample. (The closing parenthesis of
# read_feather() was missing, which is a parse error.)
dat <- read_feather(file.path(fldr_a, "Dataset_1851_1881"))

# 1881 records that carry a sibling-group id.
dat <- dat[!is.na(dat$sibling_idH) & dat$Year == 1881, ]

# Keep sibling groups with more than one member present in 1881.
dat <- data.table(dat)
dat[, sib_num := .N, by = sibling_idH]
dat <- data.frame(dat)
dat <- dat[dat$sib_num > 1, ]

# Variables needed for linking.
dat <- dat[, c("person_idH", "sibling_idH", "family_id", "Sname", "Pname", "stid", "Age", "Parish", "Address")]

# Clean names: trim whitespace, then split the given-name string on single
# spaces into first (token 1) and middle (token 2) name.
dat$Pname <- trim(dat$Pname)
dat$Sname <- trim(dat$Sname)
given_names <- strsplit(as.character(dat$Pname), " ")
dat$Fname <- trim(sapply(given_names, "[", 1))
dat$Mname <- trim(sapply(given_names, "[", 2))
rm(given_names)

# Surname and first name: drop everything except ASCII letters and spaces,
# then collapse runs of spaces/punctuation into a single space.
dat$Sname <- gsub("[[:punct:] ]+", " ", gsub("[^A-Za-z ]", "", dat$Sname))
dat$Fname <- gsub("[[:punct:] ]+", " ", gsub("[^A-Za-z ]", "", dat$Fname))

# Clean address: lower-case, letters and spaces only, trimmed.
dat$Address <- trim(gsub("[^A-Za-z ]", "", tolower(dat$Address)))

# Make sure both merge keys are plain character vectors.
dat$Parish <- as.character(dat$Parish)
dat$Address <- as.character(dat$Address)

### 2. NAPP data ###

# Baseline NAPP sample: male household heads aged 15-60 with known marital
# status and nativity and a household id. The filters are applied one after
# another on purpose.
mola <- read.dta13(paste0(fldr_napp, "MolaModified_NAPP_JAG4new.dta"))
mola <- mola[mola$SEX == "Male", ]

# NOTE(review): if AGE is imported as a labelled factor, as.numeric() returns
# level codes rather than ages - confirm the column type.
mola$AGE <- as.numeric(mola$AGE)
mola <- mola[mola$AGE >= 15 & mola$AGE <= 60, ]
mola <- mola[mola$MARST != "Unknown and NIU", ]
mola <- mola[mola$nativity != "Unknown", ]
mola <- mola[mola$hhhead == 1, ]
mola <- mola[!is.na(mola$HHNUMhead), ]

# Variables needed for linking.
mola <- mola[, c("HHNUMhead", "hhhead", "RECIDGB", "social", "NAMELAST", "NAMEFRST", "AGE", "OCC81GB", "PARIDGB", "parish", "ADDRESS", "name", "name2", "new_address", "objectid_1")]

# clean names
mola$NAMEFRST <- trim(toupper(mola$NAMEFRST))
mola$NAMELAST <- trim(mola$NAMELAST)

# Split the full given-name string ONCE and take both tokens from that
# split. Previously MNAME was read from NAMEFRST after it had already been
# cut down to its first token, so the middle name was always NA.
# vapply() keeps the result a character vector even for empty input.
given_parts <- strsplit(as.character(mola$NAMEFRST), " ")
mola$NAMEFRST <- trim(vapply(given_parts, function(p) p[1], character(1)))
mola$MNAME <- trim(vapply(given_parts, function(p) p[2], character(1)))
rm(given_parts)

# Letters and spaces only; collapse runs of spaces/punctuation.
mola$NAMELAST <- gsub("[^A-Za-z ]", "", mola$NAMELAST)
mola$NAMELAST <- gsub("[[:punct:] ]+", " ", mola$NAMELAST)
mola$NAMEFRST <- gsub("[^A-Za-z ]", "", mola$NAMEFRST)
mola$NAMEFRST <- gsub("[[:punct:] ]+", " ", mola$NAMEFRST)

# clean address
# NOTE(review): unlike Address in the linked sample, new_address is not
# lower-cased here - confirm it is already lower case, otherwise the
# address-based merge cannot match.
mola$new_address <- gsub("[^A-Za-z ]", "", mola$new_address)
mola$new_address <- trim(mola$new_address)

### 3. Linked I-CEM and NAPP ###
### step 1: Perfect match on first and last names and map point ###

# Inner join on surname, first name and map point (street id).
match <- merge(
  mola, dat,
  by.x = c("NAMELAST", "NAMEFRST", "objectid_1"),
  by.y = c("Sname", "Fname", "stid"),
  all = FALSE
)

# At most 3 years of age difference; rows without a household id (including
# all-NA rows produced by an NA age difference) are dropped.
match$diff_age <- abs(as.numeric(match$Age) - as.numeric(match$AGE))
match <- match[match$diff_age <= 3, ]
match <- match[!is.na(match$HHNUMhead), ]

# Unlike the later steps, no one-candidate-per-household rule is applied.
match <- unique(match[, c("person_idH", "HHNUMhead", "sibling_idH", "family_id", "social")])

### step 2: Perfect match on street ID and fuzzy matching on first and last names ###

# Candidates: households not linked in step 1, paired with every sibling
# record on the same street id.
fuzzy <- merge(
  mola[!(mola$HHNUMhead %in% match$HHNUMhead), ],
  dat,
  by.x = "objectid_1", by.y = "stid",
  all = FALSE
)

# Jaro-Winkler distances (p = 0.1) for surname and first name.
fuzzy$name_dist <- stringdist(fuzzy$Sname, fuzzy$NAMELAST, method = "jw", p = 0.1)
fuzzy$name_dist2 <- stringdist(fuzzy$Fname, fuzzy$NAMEFRST, method = "jw", p = 0.1)

# Keep candidates that attain BOTH the smallest surname distance and the
# smallest first-name distance within their (HHNUMhead, social) group. The
# two filters stay sequential so NA handling is unchanged.
fuzzy <- data.table(fuzzy)
fuzzy[, `:=`(best_dist = min(name_dist), best_dist2 = min(name_dist2)),
      by = .(HHNUMhead, social)]
fuzzy <- data.frame(fuzzy)
fuzzy <- fuzzy[fuzzy$best_dist == fuzzy$name_dist, ]
fuzzy <- fuzzy[fuzzy$best_dist2 == fuzzy$name_dist2, ]

# At most 3 years of age difference.
fuzzy$diff_age <- abs(as.numeric(fuzzy$Age) - as.numeric(fuzzy$AGE))
fuzzy <- fuzzy[fuzzy$diff_age <= 3, ]

fuzzy <- fuzzy[!is.na(fuzzy$HHNUMhead), ]

# Keep only unambiguous links (exactly one candidate per group).
fuzzy$x <- 1
fuzzy <- data.table(fuzzy)
fuzzy[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy <- data.frame(fuzzy)

fuzzy <- fuzzy[fuzzy$xx == 1, ]

fuzzy <- unique(fuzzy[, c("person_idH", "HHNUMhead", "sibling_idH", "family_id", "social")])

### step 3: Perfect match on parish and surname and fuzzy on first name ###

# Candidates: households not linked in steps 1-2, joined on parish and
# surname.
# NOTE(review): the NAPP key is PARIDGB while the linked-sample key is
# Parish - confirm both columns use the same coding.
fuzzy2 <- merge(
  mola[!(mola$HHNUMhead %in% c(fuzzy$HHNUMhead, match$HHNUMhead)), ],
  dat,
  by.x = c("PARIDGB", "NAMELAST"), by.y = c("Parish", "Sname"),
  all = FALSE
)
fuzzy2$name_dist2 <- stringdist(fuzzy2$Fname, fuzzy2$NAMEFRST, method = "jw", p = 0.1)

# Closest first name within each (HHNUMhead, social) group.
fuzzy2 <- data.table(fuzzy2)
fuzzy2[, best_dist2 := min(name_dist2), by = .(HHNUMhead, social)]
fuzzy2 <- data.frame(fuzzy2)
fuzzy2 <- fuzzy2[fuzzy2$best_dist2 == fuzzy2$name_dist2, ]

# At most 3 years of age difference, then the closest age within the group.
fuzzy2$diff_age <- abs(as.numeric(fuzzy2$Age) - as.numeric(fuzzy2$AGE))
fuzzy2 <- fuzzy2[fuzzy2$diff_age <= 3, ]
fuzzy2 <- data.table(fuzzy2)
fuzzy2[, min_age := min(diff_age), by = .(HHNUMhead, social)]
fuzzy2 <- data.frame(fuzzy2)
fuzzy2 <- fuzzy2[fuzzy2$min_age == fuzzy2$diff_age, ]

fuzzy2 <- fuzzy2[!is.na(fuzzy2$HHNUMhead), ]

# Keep only unambiguous links.
fuzzy2$x <- 1
fuzzy2 <- data.table(fuzzy2)
fuzzy2[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy2 <- data.frame(fuzzy2)

fuzzy2 <- fuzzy2[fuzzy2$xx == 1, ]

fuzzy2 <- unique(fuzzy2[, c("person_idH", "HHNUMhead", "sibling_idH", "family_id", "social")])

### step 4: Perfect match on parish and age and fuzzy match on last and first name ###

# Candidates: households with an id that were not linked in steps 1-3.
# %in% and is.na() never return NA, so one combined mask selects the same
# rows as successive subsetting.
fuzzy3 <- mola[
  !(mola$HHNUMhead %in% c(fuzzy2$HHNUMhead, fuzzy$HHNUMhead, match$HHNUMhead)) &
    !is.na(mola$HHNUMhead),
]

# Exact join on parish and age (so no age-difference filter in this step).
fuzzy3 <- merge(
  fuzzy3, dat,
  by.x = c("PARIDGB", "AGE"), by.y = c("Parish", "Age"),
  all = FALSE
)

fuzzy3$name_dist <- stringdist(fuzzy3$Sname, fuzzy3$NAMELAST, method = "jw", p = 0.1)
fuzzy3$name_dist2 <- stringdist(fuzzy3$Fname, fuzzy3$NAMEFRST, method = "jw", p = 0.1)

# Candidate must be the closest on surname and on first name in its group.
fuzzy3 <- data.table(fuzzy3)
fuzzy3[, `:=`(best_dist = min(name_dist), best_dist2 = min(name_dist2)),
       by = .(HHNUMhead, social)]
fuzzy3 <- data.frame(fuzzy3)
fuzzy3 <- fuzzy3[fuzzy3$best_dist == fuzzy3$name_dist, ]
fuzzy3 <- fuzzy3[fuzzy3$best_dist2 == fuzzy3$name_dist2, ]

# Keep only unambiguous links.
fuzzy3$x <- 1
fuzzy3 <- data.table(fuzzy3)
fuzzy3[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy3 <- data.frame(fuzzy3)

fuzzy3 <- fuzzy3[fuzzy3$xx == 1, ]

fuzzy3 <- unique(fuzzy3[, c("person_idH", "HHNUMhead", "sibling_idH", "family_id", "social")])

### step 5: Perfect match on parish and address and fuzzy match on last and first name ###

# Candidates: households with an id that were not linked in steps 1-4.
fuzzy4 <- mola[
  !(mola$HHNUMhead %in% c(fuzzy3$HHNUMhead, fuzzy2$HHNUMhead,
                          fuzzy$HHNUMhead, match$HHNUMhead)) &
    !is.na(mola$HHNUMhead),
]

# Exact join on parish and cleaned address.
fuzzy4 <- merge(
  fuzzy4, dat,
  by.x = c("PARIDGB", "new_address"), by.y = c("Parish", "Address"),
  all = FALSE
)

fuzzy4$name_dist <- stringdist(fuzzy4$Sname, fuzzy4$NAMELAST, method = "jw", p = 0.1)
fuzzy4$name_dist2 <- stringdist(fuzzy4$Fname, fuzzy4$NAMEFRST, method = "jw", p = 0.1)

# Candidate must be the closest on surname and on first name in its group.
fuzzy4 <- data.table(fuzzy4)
fuzzy4[, `:=`(best_dist = min(name_dist), best_dist2 = min(name_dist2)),
       by = .(HHNUMhead, social)]
fuzzy4 <- data.frame(fuzzy4)
fuzzy4 <- fuzzy4[fuzzy4$best_dist == fuzzy4$name_dist, ]
fuzzy4 <- fuzzy4[fuzzy4$best_dist2 == fuzzy4$name_dist2, ]

# At most 3 years of age difference, then the closest age within the group.
fuzzy4$diff_age <- abs(as.numeric(fuzzy4$Age) - as.numeric(fuzzy4$AGE))
fuzzy4 <- fuzzy4[fuzzy4$diff_age <= 3, ]
fuzzy4 <- data.table(fuzzy4)
fuzzy4[, min_age := min(diff_age), by = .(HHNUMhead, social)]
fuzzy4 <- data.frame(fuzzy4)
fuzzy4 <- fuzzy4[fuzzy4$min_age == fuzzy4$diff_age, ]

# Keep only unambiguous links.
fuzzy4$x <- 1
fuzzy4 <- data.table(fuzzy4)
fuzzy4[, xx := sum(x), by = .(HHNUMhead, social)]
fuzzy4 <- data.frame(fuzzy4)

fuzzy4 <- fuzzy4[fuzzy4$xx == 1, ]

fuzzy4 <- unique(fuzzy4[, c("person_idH", "HHNUMhead", "sibling_idH", "family_id", "social")])

### 4. Save data ###

# Stack the links from all five steps and keep the household identifiers
# together with the family id.
matched <- rbind(match, fuzzy, fuzzy2, fuzzy3, fuzzy4)
matched <- matched[, c("HHNUMhead", "social", "family_id")]

save(matched, file = paste0(fldr_c, "sibling.RData"))





